In [33]:
import pandas as pd
from autoc import DataExploration, PreProcessor, NaImputer
from autoc.utils.getdata import get_dataset
import numpy as np
# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, accuracy_score, auc, classification_report
# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
Two approaches:
In [34]:
titanic = get_dataset("titanic")
In [35]:
titanic.who.dtype.kind == 'O'
Out[35]:
In [36]:
titanic.head()
Out[36]:
In [37]:
exploration_titanic = DataExploration(titanic)
In [38]:
exploration_titanic.print_infos() # there are duplicate rows here because there is no id column, interesting!
In [39]:
exploration_titanic.nacolcount()
Out[39]:
In [40]:
exploration_titanic.structure()
Out[40]:
In [41]:
titanic.corr()
Out[41]:
In [42]:
titanic.loc[titanic.age.isnull(),:].head(5)
Out[42]:
In [43]:
preprocessor = PreProcessor(titanic)
In [44]:
preprocessor.infer_subtypes()
Out[44]:
In [45]:
titanic = titanic.drop('alive', axis=1)
The dataset is not clean enough to be directly converted into a NumPy array and used for machine learning with scikit-learn: the categorical columns first need to be encoded as dummy variables.
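Before encoding, one can list the non-numeric columns that prevent a direct conversion (a minimal sketch using pandas' select_dtypes):
In [ ]:
# Columns that are not numeric; these need dummy encoding (or dropping)
# before the DataFrame can be fed to scikit-learn as a float array.
non_numeric_cols = titanic.select_dtypes(exclude=[np.number]).columns
print(non_numeric_cols.tolist())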
In [46]:
features_full = pd.concat([titanic.loc[:, ['fare', 'age', 'pclass', 'sibsp', 'parch']],
                           pd.get_dummies(titanic['sex'], prefix='sex'),
                           pd.get_dummies(titanic['who'], prefix='who'),
                           pd.get_dummies(titanic['alone'], prefix='alone'),
                           pd.get_dummies(titanic['embarked'], prefix='embarked')],
                          axis=1)
In [47]:
features = pd.concat([titanic[['fare', 'age', 'pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)
In [48]:
target = titanic.survived
In [49]:
# Impute missing values
imp = NaImputer(features_full)
features_full = imp.basic_naimputation(['age']) # still a pandas DataFrame, but with 'age' imputed
target = titanic.survived
In [50]:
# Creating test train
features_train, features_test, target_train, target_test = train_test_split(
    features_full.values, target.values, test_size=0.25, random_state=0)
In [51]:
logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)
target_pred = logreg.predict(features_test)
feature_names = features_full.columns
print("Accuracy : {}".format(accuracy_score(target_test, target_pred)))
weights = logreg.coef_.flatten()
dict_weights = {k:v for k,v in zip(feature_names, weights)}
In [52]:
def plot_simple_imp(imp, feature_names, sort=True, absolute=False):
    """ Plot importances/weights as a horizontal bar chart, indexed by feature name """
    serie = pd.Series(index=feature_names, data=imp)
    if absolute:
        serie = np.abs(serie)
    if sort:
        serie.sort_values(inplace=True, ascending=False)
    serie.plot(kind='barh')
In [53]:
plot_simple_imp(weights, feature_names)
In [54]:
# Looking at weights
feature_names = features_full.columns

def plot_abs_weights(coeff_arr, feature_names, title=None, legend_size=12, figsize=(15, 7)):
    """ Plot the absolute coefficients, sorted, with matching feature labels """
    coeff_arr = np.abs(coeff_arr)  # take absolute values
    order = np.argsort(coeff_arr)  # sort coefficients and labels together
    plt.figure(figsize=figsize)
    plt.barh(range(len(feature_names)), coeff_arr[order])
    plt.yticks(range(len(feature_names)), np.asarray(feature_names)[order], size=legend_size)
    if title:
        plt.title(title)
In [55]:
plot_abs_weights(logreg.coef_.ravel(), feature_names, title="Absolute Coefficient Logistic Regression")
In [56]:
rf_full = RandomForestClassifier(n_estimators=500)
rf_full.fit(features_train, target_train)
Out[56]:
In [57]:
rf_full.score(features_test, target_test)
Out[57]:
In [58]:
plot_simple_imp(rf_full.feature_importances_, feature_names)
Here we try to predict who survived while the variable age contains naturally occurring missing values, and we compare several ways of handling them: dropping the rows with missing age, dropping the age column entirely, and imputing the missing values.
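As a quick sanity check before comparing the strategies, one might count how many age values are actually missing (a minimal sketch):
In [ ]:
# How many age values are naturally missing, and what fraction of the rows?
n_missing_age = features.age.isnull().sum()
print("{} missing age values out of {} rows ({:.1%})".format(
    n_missing_age, len(features), float(n_missing_age) / len(features)))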
In [130]:
def rf_cv(features, target, random_state=1, n_estimators=200, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print and return the scores of a random forest cross validation """
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    scores = cross_val_score(rf, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Random Forest CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
            scores.min(), scores.mean(), scores.max()))
    return scores

def logreg_cv(features, target, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print and return the scores of a logistic regression cross validation """
    logreg = LogisticRegression(C=1)
    scores = cross_val_score(logreg, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Logistic Regression CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
            scores.min(), scores.mean(), scores.max()))
    return scores

def plot_roc_curve(target_test, target_predicted_proba):
    """ Plot the ROC curve for the positive class of a binary classifier """
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
In [131]:
# selecting index and transforming into numpy array
index_missing_age = features.age.isnull()
features_rm_a, target_rm_a = features.loc[~index_missing_age, :].values, target[~index_missing_age].values
In [132]:
features_rm_a.shape
Out[132]:
In [133]:
rf_cv(features_rm_a, target_rm_a, scoring='accuracy')
Out[133]:
In [134]:
features_train_rm, features_test_rm, target_train_rm, target_test_rm = train_test_split(
    features_rm_a, target_rm_a, test_size=0.25, random_state=0)
In [135]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_rm, target_train_rm)
target_predicted_proba = rf.predict_proba(features_test_rm)
plot_roc_curve(target_test_rm, target_predicted_proba)
In [136]:
rf_cv(features_rm_a, target_rm_a,random_state=0, scoring='roc_auc')
Out[136]:
In [137]:
# selecting index and transforming into numpy array
features_cm_a, target_cm_a = features.drop('age', axis=1).values, target.values
In [138]:
features_cm_a.shape
Out[138]:
In [139]:
rf_cv(features_cm_a, target_cm_a, scoring="accuracy")
Out[139]:
In [140]:
features_train_cm, features_test_cm, target_train_cm, target_test_cm = train_test_split(
    features_cm_a, target_cm_a, test_size=0.25, random_state=0)
In [141]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_cm, target_train_cm)
target_predicted_proba = rf.predict_proba(features_test_cm)
plot_roc_curve(target_test_cm, target_predicted_proba)
In [142]:
rf_cv(features_cm_a, target_cm_a, scoring="roc_auc")
Out[142]:
In [143]:
# keep a copy of the features for the imputation approach
features_imp = features.copy()
features.shape
Out[143]:
In [144]:
# features_imp.loc[:,'is_na_age'] =features_imp.age.isnull().astype(int)
# imp = NaImputer(features) # creating our imputer instance
# features_imp = imp.basic_naimputation(columns_to_process=['age'])
In [145]:
features_imp = features.fillna(-1)
In [146]:
features_imp_a, target_imp_a = features_imp.values, target.values
In [147]:
rf_cv(features_imp_a, target_imp_a, scoring='accuracy')
Out[147]:
In [148]:
features_train_imp, features_test_imp, target_train_imp, target_test_imp = train_test_split(
    features_imp_a, target_imp_a, test_size=0.25, random_state=0)
In [149]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_imp, target_train_imp)
target_predicted_proba = rf.predict_proba(features_test_imp)
plot_roc_curve(target_test_imp, target_predicted_proba)
In [150]:
rf_cv(features_imp_a, target_imp_a, scoring='roc_auc')
Out[150]:
In [151]:
rf.feature_importances_
Out[151]:
The purpose of this section is to simulate missing values in an important variable (here pclass), observe the resulting decrease in performance, and assess our ability to correctly impute the missing values.
In [152]:
# constructing features
features_imp = pd.concat([titanic[['pclass']],
                          pd.get_dummies(titanic['sex'], prefix='sex'),
                          pd.get_dummies(titanic['who'], prefix='who')], axis=1)
In [153]:
features_imp.pclass.value_counts()
Out[153]:
In [156]:
#scores_imp = logreg_cv(features_imp.drop('pclass',1), target)
In [174]:
def insert_na(features_full=features_imp, target=target, index=False,
              col_to_simulate='pclass', pct_na_toinsert=0.2, verbose=False):
    """ Return the dataset with a certain pct of NA injected in one column
    (or only the index of the injected rows if index=True) """
    nb_na_toinsert = int(pct_na_toinsert * len(features_full))
    index_na_toinsert = np.random.choice(range(len(features_full)), nb_na_toinsert, replace=False)
    if verbose:
        print("We are inserting {} missing values".format(len(index_na_toinsert)))
    features_full_imp = features_full.copy()
    if index:
        return index_na_toinsert
    else:
        features_full_imp.loc[index_na_toinsert, col_to_simulate] = np.nan
        return features_full_imp

def score_rf_sim(features_full=features_imp, target=target,
                 col_to_simulate='pclass', pct_na_toinsert=0.2, n_repeat=10, verbose=False, *args, **kwargs):
    """ Insert a percentage of missing values in a variable, impute them, and look
    at the influence on the cross-validated performance of a logistic regression """
    features_full_imp = insert_na(features_full, target=target,
                                  col_to_simulate=col_to_simulate, pct_na_toinsert=pct_na_toinsert, verbose=verbose)
    imp_f = NaImputer(features_full_imp)
    features_full_imp.loc[:, col_to_simulate] = imp_f.fillna_serie(colname=col_to_simulate)
    # repeated cross validation
    # score_rcv = 0
    # for i in range(n_repeat):
    #     score_rcv += logreg_cv(features_full_imp, target, *args, **kwargs).mean()
    return logreg_cv(features_full_imp, target, verbose=False).mean()
In [175]:
score_rf_sim(col_to_simulate='pclass', verbose=True)
Out[175]:
In [195]:
accuracy_mean_pct_na = np.array([score_rf_sim(
    pct_na_toinsert=i, col_to_simulate='pclass', verbose=True) for i in np.linspace(0, 0.98, 10)])
In [179]:
def sim_nmc(nmc=60, n_interval=5, *args, **kwargs):
    """ Average the simulated scores over nmc Monte Carlo repetitions """
    res = np.zeros(n_interval)
    for i in range(nmc):
        res += np.array([score_rf_sim(
            pct_na_toinsert=pct, *args, **kwargs) for pct in np.linspace(0, 0.98, n_interval)])
    return res / nmc
In [180]:
test = sim_nmc(nmc=30, n_interval=5)
In [181]:
test
Out[181]:
In [182]:
np.linspace(0,0.98,5)
Out[182]:
In [183]:
plt.plot(np.linspace(0,0.98,5), test)
plt.title('Accuracy as a function of the percentage of missing values inserted')
Out[183]:
We start from all the features to get better predictive power.
In [184]:
features_pred = features_full.copy().drop_duplicates()
features_pred = features_pred.drop('age', axis=1)
In [185]:
index_na = insert_na(col_to_simulate='pclass', pct_na_toinsert=0.2, index=True)
In [186]:
index_na = features_pred.index.isin(index_na)
In [187]:
features_pred.head()
Out[187]:
In [188]:
target = features_pred.pclass
features_pred = features_pred.drop('pclass', axis=1)
In [189]:
features_pred_train, target_pred_train = features_pred.loc[~index_na, :], target[~index_na]
features_pred_test, target_pred_test = features_pred.loc[index_na, :], target[index_na]
In [190]:
features_pred_train.shape
Out[190]:
In [191]:
features_pred_test.head()
Out[191]:
In [192]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_pred_train, target_pred_train)
target_predicted = rf.predict(features_pred_test)
target_predicted_proba = rf.predict_proba(features_pred_test)
In [194]:
print(classification_report(target_pred_test, target_predicted))
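Finally, the predictions could be written back in place of the simulated missing values to complete the imputation; a minimal sketch reusing the objects defined above:
In [ ]:
# Sketch: write the random forest predictions back where NAs were simulated,
# completing the imputation of pclass.
pclass_imputed = target.copy()
pclass_imputed[index_na] = target_predicted
pclass_imputed.head()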